{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fff2b90-e03f-4bb3-b533-8abb12e85b72",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
     ]
    }
   ],
   "source": [
    "## imports \n",
    "\n",
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from nltk import pos_tag, word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.corpus import treebank\n",
    "from nltk import pos_tag\n",
    "import string\n",
    "import warnings\n",
    "from collections import Counter\n",
    "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from nltk.help import upenn_tagset\n",
    "\n",
    "# Download the NLTK packages listed below, each exactly once.\n",
    "for nltk_package in ('punkt', 'tagsets', 'averaged_perceptron_tagger', 'treebank'):\n",
    "    nltk.download(nltk_package)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f106d35f-4a10-4dc1-a279-4d60de58be25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the whole corpus from CSV, using its first column as the row index.\n",
    "\n",
    "corpus_csv = \"data/whole_corpus.csv\"\n",
    "master_df = pd.read_csv(corpus_csv, index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc38878e-593e-4c65-8a23-577f4879dd57",
   "metadata": {},
   "source": [
    "# Mean sentence length #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9910e223-f315-4b19-b8f1-af6e359cc6f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['_Chapter I_   JEAN MUIR  \"Has she come?\"', '\"No, Mamma, not yet.\"', '\"I wish it were well over.', 'The thought of it worries and excites me.', 'A cushion for my back, Bella.\"', 'And poor, peevish Mrs. Coventry sank into an easy chair with a nervous sigh and the air of a martyr, while her pretty daughter hovered about her with affectionate solicitude.', '\"Who are they talking of, Lucia?\"', 'asked the languid young man lounging on a couch near his cousin, who bent over her tapestry work with a happy smile on her usually haughty face.', '\"The new governess, Miss Muir.', 'Shall I tell you about her?\"', '\"No, thank you.', 'I have an inveterate aversion to the whole tribe.', 'I\\'ve often thanked heaven that I had but one sister, and she a spoiled child, so that I have escaped the infliction of a governess so long.\"', '\"How will you bear it now?\"']\n"
     ]
    }
   ],
   "source": [
    "# Prepping nltk punctuation list \n",
    "\n",
    "punc_list = list(string.punctuation)\n",
    "punc_list.append('``')\n",
    "punc_list.append('\"')\n",
    "punc_list.append(\"''\")\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f126ffd0-2426-45d8-b3dc-bda4acd97bc3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 sentences out of 86882\n",
      "Processed 20000 sentences out of 86882\n",
      "Processed 30000 sentences out of 86882\n",
      "Processed 40000 sentences out of 86882\n",
      "Processed 50000 sentences out of 86882\n",
      "Processed 60000 sentences out of 86882\n",
      "Processed 70000 sentences out of 86882\n",
      "Processed 80000 sentences out of 86882\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>category</th>\n",
       "      <th>author</th>\n",
       "      <th>book/prompt</th>\n",
       "      <th>model</th>\n",
       "      <th>nation</th>\n",
       "      <th>gender</th>\n",
       "      <th>race</th>\n",
       "      <th>mean_sen_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>_Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>10.785714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>asked Lucia. \"Leave the house while she is in ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>She is a nice person, I dare say, and when onc...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>14.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text   category  author  \\\n",
       "0   1  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...  authentic  alcott   \n",
       "1   2  asked Lucia. \"Leave the house while she is in ...  authentic  alcott   \n",
       "2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   \n",
       "\n",
       "     book/prompt      model    nation  gender   race mean_sen_len  \n",
       "0  behind a mask  authentic  American  female  white    10.785714  \n",
       "1  behind a mask  authentic  American  female  white         16.0  \n",
       "2  behind a mask  authentic  American  female  white         14.6  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculating mean sentence length (non-punctuation tokens per sentence)\n",
    "\n",
    "punctuation_tokens = set(punc_list)  # set lookup instead of scanning the list per token\n",
    "\n",
    "\n",
    "def mean_sentence_length(text):\n",
    "    \"\"\"Return the number of non-punctuation tokens per sentence in `text`.\n",
    "\n",
    "    Returns 0.0 when the sentence tokenizer finds no sentences (for example an\n",
    "    empty string), instead of raising ZeroDivisionError.\n",
    "    \"\"\"\n",
    "    sentences = sent_tokenize(text)\n",
    "    if not sentences:\n",
    "        return 0.0\n",
    "    words = [token for token in word_tokenize(text) if token not in punctuation_tokens]\n",
    "    return len(words) / len(sentences)\n",
    "\n",
    "\n",
    "total_rows = len(master_df)\n",
    "mean_lengths = []\n",
    "\n",
    "# Progress is counted by row position, not index label, so the messages do\n",
    "# not depend on the DataFrame having a 0..n-1 integer index.\n",
    "for position, text in enumerate(master_df[\"text\"], start=1):\n",
    "    mean_lengths.append(mean_sentence_length(text))\n",
    "    if position % 10000 == 0:\n",
    "        print(f\"Processed {position} sentences out of {total_rows}\")\n",
    "\n",
    "# Assigning the finished list lets pandas infer a numeric dtype; filling a\n",
    "# None-initialised column cell by cell leaves it as object dtype.\n",
    "master_df[\"mean_sen_len\"] = mean_lengths"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a699b39e-e0b6-42aa-a4a5-ca1e32d31ee1",
   "metadata": {},
   "source": [
    "# Sentiment Scores #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "79058103-15c8-4357-8dff-d50697a8b041",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Downloading model for sentiment scores and creating function for sentiment analysis\n",
    "\n",
    "model_name = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = DistilBertTokenizer.from_pretrained(model_name)\n",
    "model = DistilBertForSequenceClassification.from_pretrained(model_name)\n",
    "model.eval()  # inference only: make sure dropout is disabled\n",
    "\n",
    "\n",
    "def sentiment_score(text):\n",
    "    \"\"\"Return a sentiment polarity for `text` as P(class 1) - P(class 0).\n",
    "\n",
    "    The result lies in [-1, 1]. For this SST-2 checkpoint class 1 is expected\n",
    "    to be POSITIVE and class 0 NEGATIVE (confirm with `model.config.id2label`).\n",
    "    Input longer than the model's maximum length is truncated by the tokenizer.\n",
    "    \"\"\"\n",
    "    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True)\n",
    "    # Scoring needs no gradients; skipping autograd bookkeeping saves memory\n",
    "    # and time on every call.\n",
    "    with torch.no_grad():\n",
    "        logits = model(**inputs).logits\n",
    "    probabilities = F.softmax(logits, dim=-1)\n",
    "    score = probabilities[0][1] - probabilities[0][0]\n",
    "    return score.item()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4aca8931-6771-4a43-b1a4-db327e7eea9e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 1000 sentences out of 86882\n",
      "Processed 2000 sentences out of 86882\n",
      "Processed 3000 sentences out of 86882\n",
      "Processed 4000 sentences out of 86882\n",
      "Processed 5000 sentences out of 86882\n",
      "Processed 6000 sentences out of 86882\n",
      "Processed 7000 sentences out of 86882\n",
      "Processed 8000 sentences out of 86882\n",
      "Processed 9000 sentences out of 86882\n",
      "Processed 10000 sentences out of 86882\n",
      "Processed 11000 sentences out of 86882\n",
      "Processed 12000 sentences out of 86882\n",
      "Processed 13000 sentences out of 86882\n",
      "Processed 14000 sentences out of 86882\n",
      "Processed 15000 sentences out of 86882\n",
      "Processed 16000 sentences out of 86882\n",
      "Processed 17000 sentences out of 86882\n",
      "Processed 18000 sentences out of 86882\n",
      "Processed 19000 sentences out of 86882\n",
      "Processed 20000 sentences out of 86882\n",
      "Processed 21000 sentences out of 86882\n",
      "Processed 22000 sentences out of 86882\n",
      "Processed 23000 sentences out of 86882\n",
      "Processed 24000 sentences out of 86882\n",
      "Processed 25000 sentences out of 86882\n",
      "Processed 26000 sentences out of 86882\n",
      "Processed 27000 sentences out of 86882\n",
      "Processed 28000 sentences out of 86882\n",
      "Processed 29000 sentences out of 86882\n",
      "Processed 30000 sentences out of 86882\n",
      "Processed 31000 sentences out of 86882\n",
      "Processed 32000 sentences out of 86882\n",
      "Processed 33000 sentences out of 86882\n",
      "Processed 34000 sentences out of 86882\n",
      "Processed 35000 sentences out of 86882\n",
      "Processed 36000 sentences out of 86882\n",
      "Processed 37000 sentences out of 86882\n",
      "Processed 38000 sentences out of 86882\n",
      "Processed 39000 sentences out of 86882\n",
      "Processed 40000 sentences out of 86882\n",
      "Processed 41000 sentences out of 86882\n",
      "Processed 42000 sentences out of 86882\n",
      "Processed 43000 sentences out of 86882\n",
      "Processed 44000 sentences out of 86882\n",
      "Processed 45000 sentences out of 86882\n",
      "Processed 46000 sentences out of 86882\n",
      "Processed 47000 sentences out of 86882\n",
      "Processed 48000 sentences out of 86882\n",
      "Processed 49000 sentences out of 86882\n",
      "Processed 50000 sentences out of 86882\n",
      "Processed 51000 sentences out of 86882\n",
      "Processed 52000 sentences out of 86882\n",
      "Processed 53000 sentences out of 86882\n",
      "Processed 54000 sentences out of 86882\n",
      "Processed 55000 sentences out of 86882\n",
      "Processed 56000 sentences out of 86882\n",
      "Processed 57000 sentences out of 86882\n",
      "Processed 58000 sentences out of 86882\n",
      "Processed 59000 sentences out of 86882\n",
      "Processed 60000 sentences out of 86882\n",
      "Processed 61000 sentences out of 86882\n",
      "Processed 62000 sentences out of 86882\n",
      "Processed 63000 sentences out of 86882\n",
      "Processed 64000 sentences out of 86882\n",
      "Processed 65000 sentences out of 86882\n",
      "Processed 66000 sentences out of 86882\n",
      "Processed 67000 sentences out of 86882\n",
      "Processed 68000 sentences out of 86882\n",
      "Processed 69000 sentences out of 86882\n",
      "Processed 70000 sentences out of 86882\n",
      "Processed 71000 sentences out of 86882\n",
      "Processed 72000 sentences out of 86882\n",
      "Processed 73000 sentences out of 86882\n",
      "Processed 74000 sentences out of 86882\n",
      "Processed 75000 sentences out of 86882\n",
      "Processed 76000 sentences out of 86882\n",
      "Processed 77000 sentences out of 86882\n",
      "Processed 78000 sentences out of 86882\n",
      "Processed 79000 sentences out of 86882\n",
      "Processed 80000 sentences out of 86882\n",
      "Processed 81000 sentences out of 86882\n",
      "Processed 82000 sentences out of 86882\n",
      "Processed 83000 sentences out of 86882\n",
      "Processed 84000 sentences out of 86882\n",
      "Processed 85000 sentences out of 86882\n",
      "Processed 86000 sentences out of 86882\n",
      "Processed 86882 sentences out of 86882\n"
     ]
    }
   ],
   "source": [
    "# Calculating sentiment scores\n",
    "\n",
    "batch_size = 1000  # rows between progress messages; each text is still scored on its own\n",
    "\n",
    "texts = master_df[\"text\"].tolist()\n",
    "total_rows = len(texts)\n",
    "sentiment_scores = []\n",
    "\n",
    "# Walk the texts in slices purely so a progress line can be printed per slice.\n",
    "for start in range(0, total_rows, batch_size):\n",
    "    end = min(start + batch_size, total_rows)\n",
    "    sentiment_scores.extend(sentiment_score(text) for text in texts[start:end])\n",
    "    print(f\"Processed {end} sentences out of {total_rows}\")\n",
    "\n",
    "# Assigning the finished list lets pandas infer a numeric dtype (a column\n",
    "# seeded with None and filled cell by cell stays object dtype) and does not\n",
    "# rely on the index labels being unique.\n",
    "master_df[\"sentiment\"] = sentiment_scores"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bffd3d1-8374-493a-b882-6f4876093253",
   "metadata": {},
   "source": [
    "# Gendered Pronouns #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "20cea4cb-3992-44d4-a206-f26f7002495a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 sentences out of 86882\n",
      "Processed 20000 sentences out of 86882\n",
      "Processed 30000 sentences out of 86882\n",
      "Processed 40000 sentences out of 86882\n",
      "Processed 50000 sentences out of 86882\n",
      "Processed 60000 sentences out of 86882\n",
      "Processed 70000 sentences out of 86882\n",
      "Processed 80000 sentences out of 86882\n",
      "Processed 86882 sentences out of 86882\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>category</th>\n",
       "      <th>author</th>\n",
       "      <th>book/prompt</th>\n",
       "      <th>model</th>\n",
       "      <th>nation</th>\n",
       "      <th>gender</th>\n",
       "      <th>race</th>\n",
       "      <th>mean_sen_len</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>male_pronouns</th>\n",
       "      <th>female_pronouns</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>_Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>10.785714</td>\n",
       "      <td>0.913496</td>\n",
       "      <td>0.006803</td>\n",
       "      <td>0.047619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>asked Lucia. \"Leave the house while she is in ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>16.0</td>\n",
       "      <td>0.973164</td>\n",
       "      <td>0.016529</td>\n",
       "      <td>0.057851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>She is a nice person, I dare say, and when onc...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>14.6</td>\n",
       "      <td>-0.986233</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.071429</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text   category  author  \\\n",
       "0   1  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...  authentic  alcott   \n",
       "1   2  asked Lucia. \"Leave the house while she is in ...  authentic  alcott   \n",
       "2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   \n",
       "\n",
       "     book/prompt      model    nation  gender   race mean_sen_len sentiment  \\\n",
       "0  behind a mask  authentic  American  female  white    10.785714  0.913496   \n",
       "1  behind a mask  authentic  American  female  white         16.0  0.973164   \n",
       "2  behind a mask  authentic  American  female  white         14.6 -0.986233   \n",
       "\n",
       "   male_pronouns  female_pronouns  \n",
       "0       0.006803         0.047619  \n",
       "1       0.016529         0.057851  \n",
       "2       0.000000         0.071429  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Relative frequency male and female pronouns\n",
    "\n",
    "male_pronouns = [\"him\", \"his\", \"he\", \"himself\"]\n",
    "female_pronouns = [\"her\", \"hers\", \"she\", \"herself\"]\n",
    "\n",
    "batch_size = 10000  # rows between progress messages\n",
    "\n",
    "\n",
    "def pronoun_frequencies(text):\n",
    "    \"\"\"Return (male, female) pronoun counts divided by the word count of `text`.\n",
    "\n",
    "    Words are the lowercased alphabetic tokens from word_tokenize; both values\n",
    "    are 0.0 when the text contains no such tokens.\n",
    "    \"\"\"\n",
    "    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]\n",
    "    if not tokens:\n",
    "        return 0.0, 0.0\n",
    "    male_count = sum(tokens.count(pronoun) for pronoun in male_pronouns)\n",
    "    female_count = sum(tokens.count(pronoun) for pronoun in female_pronouns)\n",
    "    return male_count / len(tokens), female_count / len(tokens)\n",
    "\n",
    "\n",
    "texts = master_df[\"text\"].tolist()\n",
    "total_rows = len(texts)\n",
    "male_freqs = []\n",
    "female_freqs = []\n",
    "\n",
    "# Process in batches so a progress line can be printed after each one\n",
    "for start in range(0, total_rows, batch_size):\n",
    "    end = min(start + batch_size, total_rows)\n",
    "    for text in texts[start:end]:\n",
    "        male_freq, female_freq = pronoun_frequencies(text)\n",
    "        male_freqs.append(male_freq)\n",
    "        female_freqs.append(female_freq)\n",
    "\n",
    "    # Print progress update\n",
    "    print(f\"Processed {end} sentences out of {total_rows}\")\n",
    "\n",
    "# Create the columns directly from float lists. Seeding them with integer 0\n",
    "# and then writing floats cell by cell depends on implicit upcasting, which\n",
    "# recent pandas versions warn about.\n",
    "master_df[\"male_pronouns\"] = male_freqs\n",
    "master_df[\"female_pronouns\"] = female_freqs\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c815c78d-9451-421a-b609-70e9a9b98b07",
   "metadata": {},
   "source": [
    "# Type-Token Ratio #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "70aa151d-8e6e-4657-9aed-fc05db412fe9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 sentences out of 86882\n",
      "Processed 20000 sentences out of 86882\n",
      "Processed 30000 sentences out of 86882\n",
      "Processed 40000 sentences out of 86882\n",
      "Processed 50000 sentences out of 86882\n",
      "Processed 60000 sentences out of 86882\n",
      "Processed 70000 sentences out of 86882\n",
      "Processed 80000 sentences out of 86882\n",
      "Processed 86882 sentences out of 86882\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>category</th>\n",
       "      <th>author</th>\n",
       "      <th>book/prompt</th>\n",
       "      <th>model</th>\n",
       "      <th>nation</th>\n",
       "      <th>gender</th>\n",
       "      <th>race</th>\n",
       "      <th>mean_sen_len</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>male_pronouns</th>\n",
       "      <th>female_pronouns</th>\n",
       "      <th>TTR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>_Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>10.785714</td>\n",
       "      <td>0.913496</td>\n",
       "      <td>0.006803</td>\n",
       "      <td>0.047619</td>\n",
       "      <td>0.700680</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>asked Lucia. \"Leave the house while she is in ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>16.0</td>\n",
       "      <td>0.973164</td>\n",
       "      <td>0.016529</td>\n",
       "      <td>0.057851</td>\n",
       "      <td>0.710744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>She is a nice person, I dare say, and when onc...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>14.6</td>\n",
       "      <td>-0.986233</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.071429</td>\n",
       "      <td>0.635714</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text   category  author  \\\n",
       "0   1  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...  authentic  alcott   \n",
       "1   2  asked Lucia. \"Leave the house while she is in ...  authentic  alcott   \n",
       "2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   \n",
       "\n",
       "     book/prompt      model    nation  gender   race mean_sen_len sentiment  \\\n",
       "0  behind a mask  authentic  American  female  white    10.785714  0.913496   \n",
       "1  behind a mask  authentic  American  female  white         16.0  0.973164   \n",
       "2  behind a mask  authentic  American  female  white         14.6 -0.986233   \n",
       "\n",
       "   male_pronouns  female_pronouns       TTR  \n",
       "0       0.006803         0.047619  0.700680  \n",
       "1       0.016529         0.057851  0.710744  \n",
       "2       0.000000         0.071429  0.635714  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculating type-token ratio\n",
    "\n",
    "batch_size = 10000  # rows between progress messages\n",
    "\n",
    "\n",
    "def type_token_ratio(text):\n",
    "    \"\"\"Return distinct words / total words for `text`.\n",
    "\n",
    "    Words are the lowercased alphabetic tokens from word_tokenize; the ratio\n",
    "    is 0.0 when the text contains no such tokens (avoids division by zero).\n",
    "    \"\"\"\n",
    "    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]\n",
    "    if not tokens:\n",
    "        return 0.0\n",
    "    return len(set(tokens)) / len(tokens)\n",
    "\n",
    "\n",
    "texts = master_df[\"text\"].tolist()\n",
    "total_rows = len(texts)\n",
    "ttr_values = []\n",
    "\n",
    "# Process in batches so a progress line can be printed after each one\n",
    "for start in range(0, total_rows, batch_size):\n",
    "    end = min(start + batch_size, total_rows)\n",
    "    ttr_values.extend(type_token_ratio(text) for text in texts[start:end])\n",
    "\n",
    "    # Print progress update after each batch\n",
    "    print(f\"Processed {end} sentences out of {total_rows}\")\n",
    "\n",
    "# Create the column directly from a float list. Seeding it with integer 0 and\n",
    "# then writing floats cell by cell depends on implicit upcasting, which recent\n",
    "# pandas versions warn about.\n",
    "master_df[\"TTR\"] = ttr_values\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bdafdd9-b7d4-449b-b412-9e5e9854f7c2",
   "metadata": {},
   "source": [
    "# Lexical Density #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9d012e0a-b145-4ca0-8b14-4cd68b070bbb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /home/claudiac/nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 sentences out of 86882\n",
      "Processed 20000 sentences out of 86882\n",
      "Processed 30000 sentences out of 86882\n",
      "Processed 40000 sentences out of 86882\n",
      "Processed 50000 sentences out of 86882\n",
      "Processed 60000 sentences out of 86882\n",
      "Processed 70000 sentences out of 86882\n",
      "Processed 80000 sentences out of 86882\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>category</th>\n",
       "      <th>author</th>\n",
       "      <th>book/prompt</th>\n",
       "      <th>model</th>\n",
       "      <th>nation</th>\n",
       "      <th>gender</th>\n",
       "      <th>race</th>\n",
       "      <th>mean_sen_len</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>male_pronouns</th>\n",
       "      <th>female_pronouns</th>\n",
       "      <th>TTR</th>\n",
       "      <th>lex_density</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>_Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>10.785714</td>\n",
       "      <td>0.913496</td>\n",
       "      <td>0.006803</td>\n",
       "      <td>0.047619</td>\n",
       "      <td>0.700680</td>\n",
       "      <td>0.544218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>asked Lucia. \"Leave the house while she is in ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>16.0</td>\n",
       "      <td>0.973164</td>\n",
       "      <td>0.016529</td>\n",
       "      <td>0.057851</td>\n",
       "      <td>0.710744</td>\n",
       "      <td>0.561983</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>She is a nice person, I dare say, and when onc...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>14.6</td>\n",
       "      <td>-0.986233</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.071429</td>\n",
       "      <td>0.635714</td>\n",
       "      <td>0.600000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text   category  author  \\\n",
       "0   1  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...  authentic  alcott   \n",
       "1   2  asked Lucia. \"Leave the house while she is in ...  authentic  alcott   \n",
       "2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   \n",
       "\n",
       "     book/prompt      model    nation  gender   race mean_sen_len sentiment  \\\n",
       "0  behind a mask  authentic  American  female  white    10.785714  0.913496   \n",
       "1  behind a mask  authentic  American  female  white         16.0  0.973164   \n",
       "2  behind a mask  authentic  American  female  white         14.6 -0.986233   \n",
       "\n",
       "   male_pronouns  female_pronouns       TTR  lex_density  \n",
       "0       0.006803         0.047619  0.700680     0.544218  \n",
       "1       0.016529         0.057851  0.710744     0.561983  \n",
       "2       0.000000         0.071429  0.635714     0.600000  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calculating lexical density\n",
    "\n",
    "# POS tags counted as lexical words; built once instead of once per row.\n",
    "lexical_pos = {'NN', 'NNS', 'NNP', 'NNPS',  # Nouns\n",
    "               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # Verbs\n",
    "               'JJ', 'JJR', 'JJS',  # Adjectives\n",
    "               'RB', 'RBR', 'RBS'}  # Adverbs\n",
    "\n",
    "\n",
    "def compute_lexical_density(text):\n",
    "    \"\"\"Return the share of words in `text` tagged as noun, verb, adjective or adverb.\n",
    "\n",
    "    Words are the lowercased alphabetic tokens from word_tokenize, tagged with\n",
    "    pos_tag. Returns 0.0 when the text contains no such tokens, instead of\n",
    "    raising ZeroDivisionError.\n",
    "    \"\"\"\n",
    "    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]\n",
    "    if not tokens:\n",
    "        return 0.0\n",
    "    lexical_words = [word for word, pos in pos_tag(tokens) if pos in lexical_pos]\n",
    "    return len(lexical_words) / len(tokens)\n",
    "\n",
    "\n",
    "total_rows = len(master_df)\n",
    "densities = []\n",
    "\n",
    "# Progress is counted by row position, not index label, so the messages do\n",
    "# not depend on the DataFrame having a 0..n-1 integer index.\n",
    "for position, text in enumerate(master_df[\"text\"], start=1):\n",
    "    densities.append(compute_lexical_density(text))\n",
    "    if position % 10000 == 0:\n",
    "        print(f\"Processed {position} sentences out of {total_rows}\")\n",
    "\n",
    "# Create the column directly from a float list rather than seeding it with\n",
    "# integer 0 and overwriting each cell with a float.\n",
    "master_df[\"lex_density\"] = densities"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bfe7578-1bc1-4936-9457-9cce4eb36062",
   "metadata": {},
   "source": [
    "# Relative Frequency of High-Frequency Function Words #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "21b3c34e-b4aa-49bf-b031-62e10bdac91b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['its',\n",
       " 'during',\n",
       " 'between',\n",
       " 'how',\n",
       " 'see',\n",
       " 'be',\n",
       " \"isn't\",\n",
       " 'off',\n",
       " 'must',\n",
       " 'but',\n",
       " \"couldn't\",\n",
       " 'ours',\n",
       " 'a',\n",
       " 'about',\n",
       " 'all',\n",
       " 'any',\n",
       " 'i',\n",
       " 'our',\n",
       " 'here',\n",
       " \"aren't\",\n",
       " 'and',\n",
       " 'ourselves',\n",
       " 'itself',\n",
       " 'on',\n",
       " 'under',\n",
       " 'one',\n",
       " \"you'll\",\n",
       " 'too',\n",
       " 'this',\n",
       " 'after',\n",
       " 'then',\n",
       " 'should',\n",
       " \"that'll\",\n",
       " 'me',\n",
       " 'why',\n",
       " 'your',\n",
       " 'until',\n",
       " \"won't\",\n",
       " 'further',\n",
       " 'you',\n",
       " \"needn't\",\n",
       " \"mightn't\",\n",
       " 'they',\n",
       " \"don't\",\n",
       " 'each',\n",
       " 'same',\n",
       " 'had',\n",
       " 'just',\n",
       " \"wouldn't\",\n",
       " 'my',\n",
       " 'into',\n",
       " 'that',\n",
       " 'are',\n",
       " \"you've\",\n",
       " 'than',\n",
       " 'do',\n",
       " 'as',\n",
       " 'the',\n",
       " 'them',\n",
       " 'there',\n",
       " 'does',\n",
       " 'some',\n",
       " 'themselves',\n",
       " \"weren't\",\n",
       " \"you'd\",\n",
       " 'through',\n",
       " 'below',\n",
       " 'in',\n",
       " 'don',\n",
       " \"wasn't\",\n",
       " 'an',\n",
       " 'were',\n",
       " 'for',\n",
       " 'has',\n",
       " 'very',\n",
       " 'before',\n",
       " 'or',\n",
       " 'what',\n",
       " \"hadn't\",\n",
       " \"doesn't\",\n",
       " \"shan't\",\n",
       " 'having',\n",
       " 'no',\n",
       " 'not',\n",
       " 'well',\n",
       " 'will',\n",
       " 'over',\n",
       " 'which',\n",
       " 'yourselves',\n",
       " 'once',\n",
       " 'am',\n",
       " 'above',\n",
       " 'of',\n",
       " 'other',\n",
       " \"it's\",\n",
       " 'is',\n",
       " 'have',\n",
       " 'much',\n",
       " 'out',\n",
       " 'would',\n",
       " 'by',\n",
       " 'again',\n",
       " \"hasn't\",\n",
       " 'myself',\n",
       " 'down',\n",
       " 'could',\n",
       " 'theirs',\n",
       " 'from',\n",
       " 'while',\n",
       " 'with',\n",
       " 'who',\n",
       " 'against',\n",
       " 'doing',\n",
       " \"you're\",\n",
       " \"mustn't\",\n",
       " 'it',\n",
       " \"shouldn't\",\n",
       " \"haven't\",\n",
       " 'most',\n",
       " 'whom',\n",
       " 'can',\n",
       " 'at',\n",
       " 'been',\n",
       " 'those',\n",
       " 'being',\n",
       " 'when',\n",
       " 'where',\n",
       " 'their',\n",
       " 'was',\n",
       " 'never',\n",
       " 'nor',\n",
       " 'these',\n",
       " 'did',\n",
       " 'we',\n",
       " 'such',\n",
       " 'because',\n",
       " 'up',\n",
       " 'few',\n",
       " 'more',\n",
       " 'made',\n",
       " 'yours',\n",
       " \"should've\",\n",
       " 'go',\n",
       " 'to',\n",
       " 'yourself',\n",
       " 'only',\n",
       " 'so',\n",
       " 'might',\n",
       " 'own',\n",
       " 'now',\n",
       " 'if',\n",
       " 'upon',\n",
       " \"didn't\",\n",
       " 'both',\n",
       " 'say',\n",
       " 'said',\n",
       " 'ask',\n",
       " 'asked',\n",
       " 'reply ',\n",
       " 'replied']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.\n"
     ]
    }
   ],
   "source": [
    "# Load the function-word list, one entry per line.\n",
    "# Entries are stripped because surrounding whitespace (e.g. a trailing space) would\n",
    "# keep an entry from ever equalling a token; blank lines are skipped so they do not\n",
    "# turn into empty-named columns. The with-block closes the file handle.\n",
    "with open(\"data/top_stops_new.txt\") as stops_file:\n",
    "    top_stops = [line.strip() for line in stops_file if line.strip()]\n",
    "top_stops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "33fe7328-395f-46d5-b9ef-c67e60d51de2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 sentences out of 86882\n",
      "Processed 20000 sentences out of 86882\n",
      "Processed 30000 sentences out of 86882\n",
      "Processed 40000 sentences out of 86882\n",
      "Processed 50000 sentences out of 86882\n",
      "Processed 60000 sentences out of 86882\n",
      "Processed 70000 sentences out of 86882\n",
      "Processed 80000 sentences out of 86882\n",
      "Processed 86882 sentences out of 86882\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>category</th>\n",
       "      <th>author</th>\n",
       "      <th>book/prompt</th>\n",
       "      <th>model</th>\n",
       "      <th>nation</th>\n",
       "      <th>gender</th>\n",
       "      <th>race</th>\n",
       "      <th>mean_sen_len</th>\n",
       "      <th>...</th>\n",
       "      <th>if</th>\n",
       "      <th>upon</th>\n",
       "      <th>didn't</th>\n",
       "      <th>both</th>\n",
       "      <th>say</th>\n",
       "      <th>said</th>\n",
       "      <th>ask</th>\n",
       "      <th>asked</th>\n",
       "      <th>reply</th>\n",
       "      <th>replied</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>_Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>10.785714</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.006803</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>asked Lucia. \"Leave the house while she is in ...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>16.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.016529</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.008264</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.008264</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>She is a nice person, I dare say, and when onc...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>14.6</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.007143</td>\n",
       "      <td>0.007143</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>\"Too late, Bella, the train was in some time a...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>13.272727</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.007042</td>\n",
       "      <td>0.007042</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>We'll stand by poor little Muir, won't we?\" An...</td>\n",
       "      <td>authentic</td>\n",
       "      <td>alcott</td>\n",
       "      <td>behind a mask</td>\n",
       "      <td>authentic</td>\n",
       "      <td>American</td>\n",
       "      <td>female</td>\n",
       "      <td>white</td>\n",
       "      <td>18.125</td>\n",
       "      <td>...</td>\n",
       "      <td>0.007143</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 176 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                               text   category  author  \\\n",
       "0   1  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...  authentic  alcott   \n",
       "1   2  asked Lucia. \"Leave the house while she is in ...  authentic  alcott   \n",
       "2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   \n",
       "3   4  \"Too late, Bella, the train was in some time a...  authentic  alcott   \n",
       "4   5  We'll stand by poor little Muir, won't we?\" An...  authentic  alcott   \n",
       "\n",
       "     book/prompt      model    nation  gender   race mean_sen_len  ...  \\\n",
       "0  behind a mask  authentic  American  female  white    10.785714  ...   \n",
       "1  behind a mask  authentic  American  female  white         16.0  ...   \n",
       "2  behind a mask  authentic  American  female  white         14.6  ...   \n",
       "3  behind a mask  authentic  American  female  white    13.272727  ...   \n",
       "4  behind a mask  authentic  American  female  white       18.125  ...   \n",
       "\n",
       "         if  upon  didn't  both       say      said  ask     asked  reply   \\\n",
       "0  0.000000   0.0     0.0   0.0  0.000000  0.000000  0.0  0.006803     0.0   \n",
       "1  0.016529   0.0     0.0   0.0  0.008264  0.000000  0.0  0.008264     0.0   \n",
       "2  0.000000   0.0     0.0   0.0  0.007143  0.007143  0.0  0.000000     0.0   \n",
       "3  0.000000   0.0     0.0   0.0  0.007042  0.007042  0.0  0.000000     0.0   \n",
       "4  0.007143   0.0     0.0   0.0  0.000000  0.000000  0.0  0.000000     0.0   \n",
       "\n",
       "   replied  \n",
       "0      0.0  \n",
       "1      0.0  \n",
       "2      0.0  \n",
       "3      0.0  \n",
       "4      0.0  \n",
       "\n",
       "[5 rows x 176 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Relative frequency of every word in `top_stops`, per text:\n",
    "# occurrences of the word / number of alphabetic tokens in the text.\n",
    "\n",
    "# Number of rows handled between two progress messages\n",
    "batch_size = 10000\n",
    "n_texts = len(master_df)\n",
    "\n",
    "# One list of frequencies per text, in `top_stops` order\n",
    "stop_freq_rows = []\n",
    "\n",
    "# Process in batches\n",
    "for start in range(0, n_texts, batch_size):\n",
    "    end = min(start + batch_size, n_texts)\n",
    "\n",
    "    for text in master_df[\"text\"].iloc[start:end]:\n",
    "        tokens = word_tokenize(text)\n",
    "        tokens = [token.lower() for token in tokens if token.isalpha()]\n",
    "\n",
    "        # Count all tokens once instead of scanning the token list for every stop word.\n",
    "        token_counts = Counter(tokens)\n",
    "        total_tokens = len(tokens)\n",
    "\n",
    "        # NOTE(review): entries that are not purely alphabetic (e.g. \"isn't\") can never\n",
    "        # match, because only isalpha() tokens are kept -- their columns stay at 0.\n",
    "        # A text without alphabetic tokens gets 0.0 everywhere instead of raising ZeroDivisionError.\n",
    "        stop_freq_rows.append(\n",
    "            [token_counts[word] / total_tokens if total_tokens > 0 else 0.0 for word in top_stops]\n",
    "        )\n",
    "\n",
    "    print(f\"Processed {end} sentences out of {n_texts}\")\n",
    "\n",
    "# Attach all stop-word columns in a single concat rather than creating them one cell\n",
    "# at a time; existing copies are dropped first so re-running the cell replaces them.\n",
    "stop_freqs = pd.DataFrame(stop_freq_rows, index=master_df.index, columns=top_stops)\n",
    "master_df = pd.concat([master_df.drop(columns=top_stops, errors=\"ignore\"), stop_freqs], axis=1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67f90af1-e35a-46f8-b4a7-76320f2bf18c",
   "metadata": {},
   "source": [
    "# Relative Frequencies of Parts of Speech #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "0a05d8dd-989c-4333-a607-4d3caa1a2710",
   "metadata": {},
   "outputs": [],
   "source": [
    "# POS tags tracked as relative-frequency features, grouped by word class.\n",
    "# List order is the order in which the tag columns are created.\n",
    "all_pos_tags = (\n",
    "    ['NN', 'NNS', 'NNP', 'NNPS']                    # nouns\n",
    "    + ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']     # verbs\n",
    "    + ['JJ', 'JJR', 'JJS']                          # adjectives\n",
    "    + ['RB', 'RBR', 'RBS']                          # adverbs\n",
    "    + ['PRP', 'PRP$', 'IN', 'DT']                   # pronouns, prepositions, determiners\n",
    "    # extend with further tags as needed\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "b632d6e0-7501-4fd0-959a-ddf0a118bb3e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 10000 texts out of 86882\n",
      "Processed 20000 texts out of 86882\n",
      "Processed 30000 texts out of 86882\n",
      "Processed 40000 texts out of 86882\n",
      "Processed 50000 texts out of 86882\n",
      "Processed 60000 texts out of 86882\n",
      "Processed 70000 texts out of 86882\n",
      "Processed 80000 texts out of 86882\n",
      "   id                                               text   category  author  \\\n",
      "0   1  _Chapter I_   JEAN MUIR  \"Has she come?\" \"No, ...  authentic  alcott   \n",
      "1   2  asked Lucia. \"Leave the house while she is in ...  authentic  alcott   \n",
      "2   3  She is a nice person, I dare say, and when onc...  authentic  alcott   \n",
      "\n",
      "     book/prompt      model    nation  gender   race mean_sen_len  ...  \\\n",
      "0  behind a mask  authentic  American  female  white    10.785714  ...   \n",
      "1  behind a mask  authentic  American  female  white         16.0  ...   \n",
      "2  behind a mask  authentic  American  female  white         14.6  ...   \n",
      "\n",
      "         JJ       JJR  JJS        RB       RBR  RBS       PRP      PRP$  \\\n",
      "0  0.122449  0.000000  0.0  0.047619  0.000000  0.0  0.074830  0.040816   \n",
      "1  0.066116  0.016529  0.0  0.057851  0.008264  0.0  0.107438  0.024793   \n",
      "2  0.092857  0.000000  0.0  0.100000  0.000000  0.0  0.121429  0.050000   \n",
      "\n",
      "         IN        DT  \n",
      "0  0.142857  0.115646  \n",
      "1  0.107438  0.082645  \n",
      "2  0.014286  0.057143  \n",
      "\n",
      "[3 rows x 196 columns]\n"
     ]
    }
   ],
   "source": [
    "# Relative frequency of each tag in `all_pos_tags`, per text.\n",
    "\n",
    "# Every tag column starts at 0.0, so tags that never occur in a text keep that value\n",
    "for tag_name in all_pos_tags:\n",
    "    master_df[tag_name] = 0.0\n",
    "\n",
    "for index, text in master_df[\"text\"].items():\n",
    "    # Lowercased alphabetic tokens only\n",
    "    words = [tok.lower() for tok in word_tokenize(text) if tok.isalpha()]\n",
    "    n_words = len(words)\n",
    "\n",
    "    # Tally the tracked tags among those assigned by the tagger\n",
    "    tag_tally = Counter()\n",
    "    for _, tag in pos_tag(words):\n",
    "        if tag in all_pos_tags:\n",
    "            tag_tally[tag] += 1\n",
    "\n",
    "    # Share of the text's tokens carrying each tag (0 when the text has no tokens)\n",
    "    for tag_name, n_tagged in tag_tally.items():\n",
    "        master_df.at[index, tag_name] = n_tagged / n_words if n_words > 0 else 0\n",
    "\n",
    "    # Progress message every 10000 rows, keyed on the index label\n",
    "    if (index + 1) % 10000 == 0:\n",
    "        print(f\"Processed {index + 1} texts out of {len(master_df)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "f2bbc4fd-a6b1-4ac2-975c-7e21fe043f5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the finished feature matrix to disk; the index is written as the first column.\n",
    "master_df.to_csv(\"data/master_feature_matrix.csv\", index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ff28a1c-e633-4260-abc4-b82b707fe26a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
